Time Series Analysis

Introduction

This document contains time series analysis for the DSAN5100 final project.

Load data files and preprocess

# Load required libraries for R
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(arrow)

Attaching package: 'arrow'

The following object is masked from 'package:lubridate':

    duration

The following object is masked from 'package:utils':

    timestamp
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:arrow':

    schema

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
library(zoo)

Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
library(forecast)  
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
library(imputeTS)

Attaching package: 'imputeTS'

The following object is masked from 'package:zoo':

    na.locf
library(tseries)

Attaching package: 'tseries'

The following object is masked from 'package:imputeTS':

    na.remove
library(patchwork)
library(astsa)

Attaching package: 'astsa'

The following object is masked from 'package:forecast':

    gas
# Tell arrow to strip embedded NUL ('\0') characters when reading parquet
# files — some article text in this dataset contains them (see the
# "Stripping '\0' (nul)" warnings emitted later when the frames are combined).
options(arrow.skip_nul = TRUE)

# Set seed for reproducibility of any stochastic steps below
set.seed(42)
# Load the data: one parquet file per country, produced by the upstream
# preprocessing pipeline. Each file carries 16 columns, including the
# `compound` sentiment score and `bias_score` used throughout the analysis.

argentina_df <- read_parquet("data/argentina_data_final.parquet")
canada_df <- read_parquet("data/canada_data_final.parquet")
china_df <- read_parquet("data/china_data_final.parquet")
india_df <- read_parquet("data/india_data_final.parquet")
italy_df <- read_parquet("data/italy_data_final.parquet")
russia_df <- read_parquet("data/russia_data_final.parquet")
us_df <- read_parquet("data/us_data_final.parquet")

# Inspect one country's schema as a sanity check
head(us_df)
# A tibble: 6 × 16
     ID publishedAt    `source-name` location_code location category  year month
  <int> <chr>          <chr>         <chr>         <chr>    <chr>    <int> <int>
1   816 2020-08-07T11… Minneapolis … us            United … general   2020     8
2   901 2020-08-07T11… Google News   us            United … general   2020     8
3   961 2020-08-07T11… CNBC          us            United … general   2020     8
4  1054 2020-08-07T12… Google News   us            United … general   2020     8
5  1147 2020-08-07T12… CNET          us            United … general   2020     8
6  1335 2020-08-07T13… The Washingt… us            United … general   2020     8
# ℹ 8 more variables: new_title <chr>, neg <dbl>, neu <dbl>, pos <dbl>,
#   compound <dbl>, sentiment_category <chr>, bias_category <chr>,
#   bias_score <dbl>
# Change publishedAt column into a date type

# Function to convert publishedAt to date type
# Coerce the `publishedAt` column of a country dataframe to Date class.
# ISO-8601 timestamp strings (e.g. "2020-08-07T11:30:00") are truncated to
# their date component by as.Date(); all other columns pass through untouched.
convert_date_column <- function(df) {
  df[["publishedAt"]] <- as.Date(df[["publishedAt"]])
  df
}

# Apply the date conversion to every country dataframe so all seven share a
# Date-typed `publishedAt` column before they are combined.
argentina_df <- convert_date_column(argentina_df)
canada_df <- convert_date_column(canada_df)
china_df <- convert_date_column(china_df)
india_df <- convert_date_column(india_df)
italy_df <- convert_date_column(italy_df)
russia_df <- convert_date_column(russia_df)
us_df <- convert_date_column(us_df)

# Check the data type conversion
cat("Date type conversion completed:\n")
Date type conversion completed:
cat("publishedAt column type for US data:", class(us_df$publishedAt), "\n")
publishedAt column type for US data: Date 
# Check date range for various country data

cat("Date range for US data:", as.character(range(us_df$publishedAt, na.rm = TRUE)), "\n")
Date range for US data: 2015-04-01 2021-11-29 
cat("Date range for Argentina data:", as.character(range(argentina_df$publishedAt, na.rm = TRUE)), "\n")
Date range for Argentina data: 2016-10-02 2021-11-29 
cat("Date range for Canada data:", as.character(range(canada_df$publishedAt, na.rm = TRUE)), "\n")
Date range for Canada data: 2013-12-26 2021-11-29 
cat("Date range for China data:", as.character(range(china_df$publishedAt, na.rm = TRUE)), "\n")
Date range for China data: 2019-11-23 2021-11-29 
cat("Date range for India data:", as.character(range(india_df$publishedAt, na.rm = TRUE)), "\n")
Date range for India data: 2015-04-25 2021-11-29 
cat("Date range for Italy data:", as.character(range(italy_df$publishedAt, na.rm = TRUE)), "\n")
Date range for Italy data: 2016-07-10 2021-11-29 
cat("Date range for Russia data:", as.character(range(russia_df$publishedAt, na.rm = TRUE)), "\n")
Date range for Russia data: 2010-11-24 2021-11-29 

All of the countries share an end date of 2021-11-29, after which data were no longer collected. The earliest publication dates range from 2010-11-24 (Russia) to 2019-11-23 (China).

# Combine the country datasets into df_countries

# Set Arrow option to handle null characters (redundant with the earlier
# options() call, but harmless; kept so this chunk also runs standalone)
options(arrow.skip_nul = TRUE)

# Stack all seven country dataframes row-wise. The columns match across
# files (the combined frame also has 16 columns), so bind_rows is a plain
# concatenation; the '\0'-stripping warnings emitted here are expected.
df_countries <- bind_rows(
  us_df, 
  canada_df, 
  china_df, 
  india_df, 
  italy_df, 
  russia_df, 
  argentina_df
)
Warning in vec_rbind(!!!dots, .names_to = .id, .error_call = current_env()):
Stripping '\0' (nul) from character vector

Warning in vec_rbind(!!!dots, .names_to = .id, .error_call = current_env()):
Stripping '\0' (nul) from character vector

Warning in vec_rbind(!!!dots, .names_to = .id, .error_call = current_env()):
Stripping '\0' (nul) from character vector
# Check the combined dataset
cat("Combined dataset dimensions:", nrow(df_countries), "rows,", ncol(df_countries), "columns\n")
Combined dataset dimensions: 796684 rows, 16 columns
cat("Countries included:", unique(df_countries$location), "\n")
Countries included: United States Canada China India Italy Russian Federation Argentina 
# Filter the dataset to start from 2020 for more focused analysis.
# NOTE(review): `df_countries_2020` does not appear to be used by the
# aggregation code below, which operates on the full `df_countries` (the
# monthly range later starts at 2015-04) — confirm which dataset the
# downstream analysis is meant to use.
df_countries_2020 <- df_countries %>% filter(year > 2019)

Analysis: Health, General, Science, and Business categories combined

In order to capture the sentiment around COVID-19 and to leverage the scarce related data in the dataset, we have combined the Health, Science, General, and Business categories to explore sentiment and bias during the COVID-19 pandemic (2020-02 to 2021-11).

# Daily combined sentiment and bias for the USA across the health, general,
# science, and business categories.
# NOTE(review): "business" is included in the filter although the
# surrounding narrative names only health/general/science; the plot titles
# below also list Business, so the code looks intentional — confirm the
# narrative should match.

# Prepare daily per-date averages, then reshape to long format for plotting
usa_combined_daily <- df_countries %>%
  filter(location == "United States", category %in% c("health", "general", "science", "business")) %>%
  group_by(publishedAt) %>%
  summarise(
    # Daily means of the article-level compound sentiment and bias scores
    avg_sentiment = mean(compound, na.rm = TRUE),
    avg_bias = mean(bias_score, na.rm = TRUE),
    # Number of articles contributing to each day's average
    n_articles = n(),
    .groups = 'drop'
  ) %>%
  rename(date = publishedAt) %>%
  # Transform to long format for plotting (one row per date x metric)
  pivot_longer(
    cols = c(avg_sentiment, avg_bias),
    names_to = "metric",
    values_to = "score"
  ) %>%
  mutate(
    # Human-readable metric labels for plot legends
    metric = case_when(
      metric == "avg_sentiment" ~ "Sentiment",
      metric == "avg_bias" ~ "Bias"
    )
  )

head(usa_combined_daily)
# A tibble: 6 × 4
  date       n_articles metric     score
  <date>          <int> <chr>      <dbl>
1 2015-04-01          1 Sentiment  0.637
2 2015-04-01          1 Bias       0.973
3 2015-05-30          1 Sentiment  0    
4 2015-05-30          1 Bias      -0.522
5 2015-10-23          1 Sentiment  0    
6 2015-10-23          1 Bias       0.555
# USA daily transformed

# Pivot the long daily data back to wide format: one row per date with
# separate sentiment and bias columns (the n_articles column acts as an
# id column during the pivot and is then dropped by select()).
usa_combined_daily_transformed <- usa_combined_daily %>%
  pivot_wider(
    names_from = metric,
    values_from = score
  ) %>%
  select(date, Sentiment, Bias) %>%
  rename(
    sentiment = Sentiment,
    bias = Bias
  )

head(usa_combined_daily_transformed)
# A tibble: 6 × 3
  date       sentiment   bias
  <date>         <dbl>  <dbl>
1 2015-04-01     0.637  0.973
2 2015-05-30     0     -0.522
3 2015-10-23     0      0.555
4 2015-11-11    -0.459 -0.659
5 2016-05-29     0     -0.894
6 2016-06-28     0      0.615
# Create monthly aggregated data directly from df_countries, using the same
# USA + four-category filter as the daily series but grouping by calendar
# month (floor_date snaps each publishedAt to the first of its month).
usa_combined_monthly <- df_countries %>%
  filter(location == "United States", category %in% c("health", "general", "science", "business")) %>%
  mutate(year_month = floor_date(publishedAt, "month")) %>%
  group_by(year_month) %>%
  summarise(
    # Monthly means of the article-level scores
    sentiment = mean(compound, na.rm = TRUE),
    bias = mean(bias_score, na.rm = TRUE),
    n_articles = n(),
    .groups = 'drop'
  ) %>%
  select(year_month, sentiment, bias) %>%
  arrange(year_month)

head(usa_combined_monthly)
# A tibble: 6 × 3
  year_month sentiment   bias
  <date>         <dbl>  <dbl>
1 2015-04-01     0.637  0.973
2 2015-05-01     0     -0.522
3 2015-10-01     0      0.555
4 2015-11-01    -0.459 -0.659
5 2016-05-01     0     -0.894
6 2016-06-01     0      0.615
cat("Monthly data range:", as.character(range(usa_combined_monthly$year_month)), "\n")
Monthly data range: 2015-04-01 2021-11-01 
cat("Number of months:", nrow(usa_combined_monthly), "\n")
Number of months: 32 

Fill in missing values

# Fill in missing data using a Kalman filter (imputeTS::na_kalman, below)

# Fill in missing data for the daily df

# Build the complete calendar-day sequence so that days with no published
# articles become explicit NA rows after the join
date_range <- seq(from = min(usa_combined_daily_transformed$date), 
                  to = max(usa_combined_daily_transformed$date), 
                  by = "day")

# Left-join the observed values onto the full grid; absent days get NA
# sentiment/bias, which the Kalman step will impute
usa_daily_complete <- data.frame(date = date_range) %>%
  left_join(usa_combined_daily_transformed, by = "date")

# Check for missing values
cat("Missing values in daily data:\n")
Missing values in daily data:
cat("Sentiment:", sum(is.na(usa_daily_complete$sentiment)), "\n")
Sentiment: 1943 
cat("Bias:", sum(is.na(usa_daily_complete$bias)), "\n")
Bias: 1943 
# Impute the NA days with a structural time-series model fitted via Kalman
# smoothing (imputeTS::na_kalman with model = "StructTS", smooth = TRUE),
# then keep only the date plus the imputed sentiment/bias columns.
usa_daily_filled <- usa_daily_complete %>%
  mutate(
    sentiment_filled = na_kalman(sentiment, model = "StructTS", smooth = TRUE),
    bias_filled = na_kalman(bias, model = "StructTS", smooth = TRUE)
  ) %>%
  select(date, sentiment = sentiment_filled, bias = bias_filled)

cat("Daily data after Kalman filtering:\n")
Daily data after Kalman filtering:
cat("Total days:", nrow(usa_daily_filled), "\n")
Total days: 2435 
cat("Missing values - Sentiment:", sum(is.na(usa_daily_filled$sentiment)), "\n")
Missing values - Sentiment: 0 
cat("Missing values - Bias:", sum(is.na(usa_daily_filled$bias)), "\n")
Missing values - Bias: 0 
head(usa_daily_filled)
        date sentiment      bias
1 2015-04-01 0.6369000 0.9733290
2 2015-04-02 0.6190398 0.9358947
3 2015-04-03 0.6088177 0.9113319
4 2015-04-04 0.5985956 0.8867690
5 2015-04-05 0.5883736 0.8622062
6 2015-04-06 0.5781515 0.8376433
# Fill in missing data for the monthly df

# Build the complete month-start sequence (floor_date is a no-op here since
# year_month values are already month starts, but it is kept for safety)
monthly_range <- seq(from = floor_date(min(usa_combined_monthly$year_month), "month"),
                     to = floor_date(max(usa_combined_monthly$year_month), "month"),
                     by = "month")

# Left-join observed monthly values onto the full grid; months with no
# articles become NA rows to be imputed below
usa_monthly_complete <- data.frame(year_month = monthly_range) %>%
  left_join(usa_combined_monthly, by = "year_month")

# Check for missing values
cat("\nMissing values in monthly data:\n")

Missing values in monthly data:
cat("Sentiment:", sum(is.na(usa_monthly_complete$sentiment)), "\n")
Sentiment: 48 
cat("Bias:", sum(is.na(usa_monthly_complete$bias)), "\n")
Bias: 48 
# Apply Kalman filter to fill missing values (if any)
# Guard: only run the Kalman imputation when the monthly frame actually
# contains NAs; otherwise pass the observed values straight through.
if(any(is.na(usa_monthly_complete$sentiment)) || any(is.na(usa_monthly_complete$bias))) {
  cat("Applying Kalman filter to fill missing monthly values...\n")
  # Same StructTS-based Kalman smoothing as used for the daily series
  usa_monthly_filled <- usa_monthly_complete %>%
    mutate(
      sentiment_filled = na_kalman(sentiment, model = "StructTS", smooth = TRUE),
      bias_filled = na_kalman(bias, model = "StructTS", smooth = TRUE)
    ) %>%
    select(year_month, sentiment = sentiment_filled, bias = bias_filled)
} else {
  cat("No missing values found in monthly data - using original data...\n")
  usa_monthly_filled <- usa_monthly_complete %>%
    select(year_month, sentiment, bias)
}
Applying Kalman filter to fill missing monthly values...
cat("Monthly data after Kalman filtering:\n")
Monthly data after Kalman filtering:
cat("Total months:", nrow(usa_monthly_filled), "\n")
Total months: 80 
cat("Missing values - Sentiment:", sum(is.na(usa_monthly_filled$sentiment)), "\n")
Missing values - Sentiment: 0 
cat("Missing values - Bias:", sum(is.na(usa_monthly_filled$bias)), "\n")
Missing values - Bias: 0 
head(usa_monthly_filled)
  year_month    sentiment       bias
1 2015-04-01  0.636900000  0.9733290
2 2015-05-01  0.000000000 -0.5220433
3 2015-06-01  0.206343783  0.4378242
4 2015-07-01  0.135316092  0.3692243
5 2015-08-01  0.064288401  0.3006243
6 2015-09-01 -0.006739289  0.2320244

Conversion of dataframes into time series

# Convert daily df into a time series

# Extract the start date for daily time series
daily_start_date <- as.Date(min(usa_daily_filled$date))
daily_start_year <- as.numeric(format(daily_start_date, "%Y"))
daily_start_yday <- as.numeric(format(daily_start_date, "%j"))  # day of year (1-366)

# Create daily time series objects.
# NOTE(review): frequency = 365.25 combined with start = c(year, day-of-year)
# produces a fractional time index — the printed Start/End for these series
# are decimals rather than (year, period) pairs. frequency = 365, or a
# date-indexed zoo/xts object, would align cycles more cleanly; confirm the
# fractional index is acceptable for the downstream ADF/differencing steps.
sentiment_daily_ts <- ts(usa_daily_filled$sentiment, 
                        start = c(daily_start_year, daily_start_yday), 
                        frequency = 365.25)

bias_daily_ts <- ts(usa_daily_filled$bias, 
                   start = c(daily_start_year, daily_start_yday), 
                   frequency = 365.25)

# Display basic info about the time series
cat("Daily Time Series Information:\n")
Daily Time Series Information:
cat("Sentiment TS - Start:", paste(start(sentiment_daily_ts), collapse = ", "), 
    "End:", paste(end(sentiment_daily_ts), collapse = ", "), 
    "Frequency:", frequency(sentiment_daily_ts), "\n")
Sentiment TS - Start: 2015.24640657084 End: 2021.91033538672 Frequency: 365.25 
cat("Bias TS - Start:", paste(start(bias_daily_ts), collapse = ", "), 
    "End:", paste(end(bias_daily_ts), collapse = ", "), 
    "Frequency:", frequency(bias_daily_ts), "\n")
Bias TS - Start: 2015.24640657084 End: 2021.91033538672 Frequency: 365.25 
# Convert monthly df into a time series

# Extract the start year/month for the monthly time series
monthly_start_date <- as.Date(min(usa_monthly_filled$year_month))
monthly_start_year <- as.numeric(format(monthly_start_date, "%Y"))
monthly_start_month <- as.numeric(format(monthly_start_date, "%m"))

# Create monthly time series objects (frequency 12 = calendar months)
sentiment_monthly_ts <- ts(usa_monthly_filled$sentiment, 
                          start = c(monthly_start_year, monthly_start_month), 
                          frequency = 12)

bias_monthly_ts <- ts(usa_monthly_filled$bias, 
                     start = c(monthly_start_year, monthly_start_month), 
                     frequency = 12)

# Display basic info about the time series
cat("Monthly Time Series Information:\n")
Monthly Time Series Information:
cat("Sentiment TS - Start:", paste(start(sentiment_monthly_ts), collapse = ", "), 
    "End:", paste(end(sentiment_monthly_ts), collapse = ", "), 
    "Frequency:", frequency(sentiment_monthly_ts), "\n")
Sentiment TS - Start: 2015, 4 End: 2021, 11 Frequency: 12 
cat("Bias TS - Start:", paste(start(bias_monthly_ts), collapse = ", "), 
    "End:", paste(end(bias_monthly_ts), collapse = ", "), 
    "Frequency:", frequency(bias_monthly_ts), "\n")
Bias TS - Start: 2015, 4 End: 2021, 11 Frequency: 12 
# Show first few values of each time series
cat("\nFirst 6 values of sentiment_monthly_ts:\n")

First 6 values of sentiment_monthly_ts:
print(head(sentiment_monthly_ts))
              Apr          May          Jun          Jul          Aug
2015  0.636900000  0.000000000  0.206343783  0.135316092  0.064288401
              Sep
2015 -0.006739289
cat("\nFirst 6 values of bias_monthly_ts:\n")

First 6 values of bias_monthly_ts:
print(head(bias_monthly_ts))
            Apr        May        Jun        Jul        Aug        Sep
2015  0.9733290 -0.5220433  0.4378242  0.3692243  0.3006243  0.2320244

Series Visualization

# Plot daily sentiment

# Dates for the x-axis (same order as the ts values)
daily_dates <- usa_daily_filled$date

# Interactive line chart of the (Kalman-imputed) daily sentiment series
fig_daily_sentiment <- plot_ly(x = daily_dates, y = as.numeric(sentiment_daily_ts), 
                               type = 'scatter', mode = 'lines',
                               line = list(color = 'steelblue', width = 1.5),
                               name = 'Daily Sentiment') %>%
  layout(title = "Daily Sentiment Analysis - USA Combined Categories (Health, General, Science, Business)",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Sentiment Score"),
         hovermode = 'x unified')

fig_daily_sentiment
# Plot monthly sentiment

# Month-start dates for the x-axis (same order as the ts values)
monthly_dates <- usa_monthly_filled$year_month

# Interactive line+marker chart of the monthly average sentiment series
fig_monthly_sentiment <- plot_ly(x = monthly_dates, y = as.numeric(sentiment_monthly_ts), 
                                 type = 'scatter', mode = 'lines+markers',
                                 line = list(color = 'darkblue', width = 2),
                                 marker = list(size = 6, color = 'darkblue'),
                                 name = 'Monthly Sentiment') %>%
  layout(title = "Monthly Sentiment Analysis - USA Combined Categories (Health, General, Science, Business)",
         xaxis = list(title = "Month"),
         yaxis = list(title = "Average Sentiment Score"),
         hovermode = 'x unified')

fig_monthly_sentiment
# Plot daily bias

# Interactive line chart of the (Kalman-imputed) daily bias series;
# reuses daily_dates created for the sentiment figure
fig_daily_bias <- plot_ly(x = daily_dates, y = as.numeric(bias_daily_ts), 
                          type = 'scatter', mode = 'lines',
                          line = list(color = 'coral', width = 1.5),
                          name = 'Daily Bias') %>%
  layout(title = "Daily Bias Analysis - USA Combined Categories (Health, General, Science, Business)",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Bias Score"),
         hovermode = 'x unified')

fig_daily_bias
# Plot monthly bias

# Interactive line+marker chart of the monthly average bias series;
# reuses monthly_dates created for the sentiment figure
fig_monthly_bias <- plot_ly(x = monthly_dates, y = as.numeric(bias_monthly_ts), 
                            type = 'scatter', mode = 'lines+markers',
                            line = list(color = 'darkred', width = 2),
                            marker = list(size = 6, color = 'darkred'),
                            name = 'Monthly Bias') %>%
  layout(title = "Monthly Bias Analysis - USA Combined Categories (Health, General, Science, Business)",
         xaxis = list(title = "Month"),
         yaxis = list(title = "Average Bias Score"),
         hovermode = 'x unified')

fig_monthly_bias

ADF Test for Stationarity

# ADF Test for stationarity of daily data

cat("=== ADF Test Results for Daily Time Series ===\n\n")
=== ADF Test Results for Daily Time Series ===
# ADF test for daily sentiment
cat("Daily Sentiment ADF Test:\n")
Daily Sentiment ADF Test:
adf_daily_sentiment <- adf.test(sentiment_daily_ts, alternative = "stationary")
print(adf_daily_sentiment)

    Augmented Dickey-Fuller Test

data:  sentiment_daily_ts
Dickey-Fuller = -2.9535, Lag order = 13, p-value = 0.1746
alternative hypothesis: stationary
cat("Interpretation: ")
Interpretation: 
if(adf_daily_sentiment$p.value < 0.05) {
  cat("Series is STATIONARY (p-value < 0.05)\n")
} else {
  cat("Series is NON-STATIONARY (p-value >= 0.05)\n")
}
Series is NON-STATIONARY (p-value >= 0.05)
cat("\n" , rep("-", 50), "\n\n")

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# ADF test for daily bias
cat("Daily Bias ADF Test:\n")
Daily Bias ADF Test:
adf_daily_bias <- adf.test(bias_daily_ts, alternative = "stationary")
Warning in adf.test(bias_daily_ts, alternative = "stationary"): p-value smaller
than printed p-value
print(adf_daily_bias)

    Augmented Dickey-Fuller Test

data:  bias_daily_ts
Dickey-Fuller = -4.1456, Lag order = 13, p-value = 0.01
alternative hypothesis: stationary
cat("Interpretation: ")
Interpretation: 
if(adf_daily_bias$p.value < 0.05) {
  cat("Series is STATIONARY (p-value < 0.05)\n")
} else {
  cat("Series is NON-STATIONARY (p-value >= 0.05)\n")
}
Series is STATIONARY (p-value < 0.05)
# ADF Test for stationarity of monthly data

cat("=== ADF Test Results for Monthly Time Series ===\n\n")
=== ADF Test Results for Monthly Time Series ===
# ADF test for monthly sentiment
cat("Monthly Sentiment ADF Test:\n")
Monthly Sentiment ADF Test:
adf_monthly_sentiment <- adf.test(sentiment_monthly_ts, alternative = "stationary")
print(adf_monthly_sentiment)

    Augmented Dickey-Fuller Test

data:  sentiment_monthly_ts
Dickey-Fuller = -1.6999, Lag order = 4, p-value = 0.6985
alternative hypothesis: stationary
cat("Interpretation: ")
Interpretation: 
if(adf_monthly_sentiment$p.value < 0.05) {
  cat("Series is STATIONARY (p-value < 0.05)\n")
} else {
  cat("Series is NON-STATIONARY (p-value >= 0.05)\n")
}
Series is NON-STATIONARY (p-value >= 0.05)
cat("\n" , rep("-", 50), "\n\n")

 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# ADF test for monthly bias
cat("Monthly Bias ADF Test:\n")
Monthly Bias ADF Test:
adf_monthly_bias <- adf.test(bias_monthly_ts, alternative = "stationary")
print(adf_monthly_bias)

    Augmented Dickey-Fuller Test

data:  bias_monthly_ts
Dickey-Fuller = -2.8844, Lag order = 4, p-value = 0.2138
alternative hypothesis: stationary
cat("Interpretation: ")
Interpretation: 
if(adf_monthly_bias$p.value < 0.05) {
  cat("Series is STATIONARY (p-value < 0.05)\n")
} else {
  cat("Series is NON-STATIONARY (p-value >= 0.05)\n")
}
Series is NON-STATIONARY (p-value >= 0.05)
cat("\n=== Summary of Stationarity Tests ===\n")

=== Summary of Stationarity Tests ===
cat("Daily Sentiment:  ", ifelse(adf_daily_sentiment$p.value < 0.05, "STATIONARY", "NON-STATIONARY"), 
    " (p =", round(adf_daily_sentiment$p.value, 4), ")\n")
Daily Sentiment:   NON-STATIONARY  (p = 0.1746 )
cat("Daily Bias:       ", ifelse(adf_daily_bias$p.value < 0.05, "STATIONARY", "NON-STATIONARY"), 
    " (p =", round(adf_daily_bias$p.value, 4), ")\n")
Daily Bias:        STATIONARY  (p = 0.01 )
cat("Monthly Sentiment:", ifelse(adf_monthly_sentiment$p.value < 0.05, "STATIONARY", "NON-STATIONARY"), 
    " (p =", round(adf_monthly_sentiment$p.value, 4), ")\n")
Monthly Sentiment: NON-STATIONARY  (p = 0.6985 )
cat("Monthly Bias:     ", ifelse(adf_monthly_bias$p.value < 0.05, "STATIONARY", "NON-STATIONARY"), 
    " (p =", round(adf_monthly_bias$p.value, 4), ")\n")
Monthly Bias:      NON-STATIONARY  (p = 0.2138 )

Differencing of time series

# First-order difference the daily sentiment series to remove the trend
diff_sentiment_daily <- diff(sentiment_daily_ts)

# Plot the differenced data.
# Fix: time() returns a `ts` object; passing it directly to an aesthetic
# makes ggplot warn "Don't know how to automatically pick scale for object
# of type <ts>". as.numeric() strips the ts class so a continuous scale is
# selected cleanly, with identical plotted values.
p1 <- ggplot() + 
  geom_line(aes(x = as.numeric(time(diff_sentiment_daily)),
                y = as.numeric(diff_sentiment_daily)),
            color = "steelblue") + 
  labs(title = "Differenced Daily Sentiment - USA Combined Categories",
       x = "Time",
       y = "Differenced Sentiment Score") +
  theme_minimal()

# Display the Differenced Plot
p1
Don't know how to automatically pick scale for object of type <ts>. Defaulting
to continuous.

# Check stationarity of differenced series
cat("ADF Test for Differenced Daily Sentiment:\n")
ADF Test for Differenced Daily Sentiment:
adf_diff_daily_sentiment <- adf.test(diff_sentiment_daily, alternative = "stationary")
Warning in adf.test(diff_sentiment_daily, alternative = "stationary"): p-value
smaller than printed p-value
print(adf_diff_daily_sentiment)

    Augmented Dickey-Fuller Test

data:  diff_sentiment_daily
Dickey-Fuller = -10.18, Lag order = 13, p-value = 0.01
alternative hypothesis: stationary
cat("Interpretation: ")
Interpretation: 
if(adf_diff_daily_sentiment$p.value < 0.05) {
  cat("Differenced series is STATIONARY (p-value < 0.05)\n")
} else {
  cat("Differenced series is still NON-STATIONARY (p-value >= 0.05)\n")
}
Differenced series is STATIONARY (p-value < 0.05)
# No need to difference bias data as the series is stationary
# First-order difference the monthly sentiment series
diff_sentiment_monthly <- diff(sentiment_monthly_ts)

# Plot the differenced data.
# Fix: wrap time() in as.numeric() so ggplot receives a plain numeric x
# vector instead of a `ts` object, avoiding the automatic-scale warning.
p3 <- ggplot() + 
  geom_line(aes(x = as.numeric(time(diff_sentiment_monthly)),
                y = as.numeric(diff_sentiment_monthly)),
            color = "darkblue") + 
  labs(title = "Differenced Monthly Sentiment - USA Combined Categories",
       x = "Time",
       y = "Differenced Sentiment Score") +
  theme_minimal()

# Display the Differenced Plot
p3
Don't know how to automatically pick scale for object of type <ts>. Defaulting
to continuous.

# Check stationarity of differenced series
cat("ADF Test for Differenced Monthly Sentiment:\n")
ADF Test for Differenced Monthly Sentiment:
adf_diff_monthly_sentiment <- adf.test(diff_sentiment_monthly, alternative = "stationary")
Warning in adf.test(diff_sentiment_monthly, alternative = "stationary"):
p-value smaller than printed p-value
print(adf_diff_monthly_sentiment)

    Augmented Dickey-Fuller Test

data:  diff_sentiment_monthly
Dickey-Fuller = -4.6235, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary
cat("Interpretation: ")
Interpretation: 
if(adf_diff_monthly_sentiment$p.value < 0.05) {
  cat("Differenced series is STATIONARY (p-value < 0.05)\n")
} else {
  cat("Differenced series is still NON-STATIONARY (p-value >= 0.05)\n")
}
Differenced series is STATIONARY (p-value < 0.05)
# First-order difference the monthly bias series
diff_bias_monthly <- diff(bias_monthly_ts)

# Plot the differenced data.
# Fix: wrap time() in as.numeric() so ggplot receives a plain numeric x
# vector instead of a `ts` object, avoiding the automatic-scale warning.
p4 <- ggplot() + 
  geom_line(aes(x = as.numeric(time(diff_bias_monthly)),
                y = as.numeric(diff_bias_monthly)),
            color = "darkred") + 
  labs(title = "Differenced Monthly Bias - USA Combined Categories",
       x = "Time",
       y = "Differenced Bias Score") +
  theme_minimal()

# Display the Differenced Plot
p4
Don't know how to automatically pick scale for object of type <ts>. Defaulting
to continuous.

# Check stationarity of differenced series
cat("ADF Test for Differenced Monthly Bias:\n")
ADF Test for Differenced Monthly Bias:
adf_diff_monthly_bias <- adf.test(diff_bias_monthly, alternative = "stationary")
Warning in adf.test(diff_bias_monthly, alternative = "stationary"): p-value
smaller than printed p-value
print(adf_diff_monthly_bias)

    Augmented Dickey-Fuller Test

data:  diff_bias_monthly
Dickey-Fuller = -5.6506, Lag order = 4, p-value = 0.01
alternative hypothesis: stationary
cat("Interpretation: ")
Interpretation: 
if(adf_diff_monthly_bias$p.value < 0.05) {
  cat("Differenced series is STATIONARY (p-value < 0.05)\n")
} else {
  cat("Differenced series is still NON-STATIONARY (p-value >= 0.05)\n")
}
Differenced series is STATIONARY (p-value < 0.05)
# Summary of differencing results
cat("\n=== Summary of Differencing Results ===\n")

=== Summary of Differencing Results ===
cat("Differenced Daily Sentiment:   ", ifelse(adf_diff_daily_sentiment$p.value < 0.05, "STATIONARY", "NON-STATIONARY"), 
    " (p =", round(adf_diff_daily_sentiment$p.value, 4), ")\n")
Differenced Daily Sentiment:    STATIONARY  (p = 0.01 )
cat("Differenced Monthly Sentiment: ", ifelse(adf_diff_monthly_sentiment$p.value < 0.05, "STATIONARY", "NON-STATIONARY"), 
    " (p =", round(adf_diff_monthly_sentiment$p.value, 4), ")\n")
Differenced Monthly Sentiment:  STATIONARY  (p = 0.01 )
cat("Differenced Monthly Bias:      ", ifelse(adf_diff_monthly_bias$p.value < 0.05, "STATIONARY", "NON-STATIONARY"), 
    " (p =", round(adf_diff_monthly_bias$p.value, 4), ")\n")
Differenced Monthly Bias:       STATIONARY  (p = 0.01 )

ACF Plots of differenced series

# Load patchwork for composing plots with the `/` operator
# (already attached earlier in the document; redundant but harmless)
library(patchwork)

# ACF plot for differenced daily sentiment (50 lags)
acf_daily_sentiment <- ggAcf(diff_sentiment_daily, 50) +
  ggtitle("ACF of Differenced Daily Sentiment") +
  theme_minimal()

# ACF plot for differenced monthly sentiment (20 lags)
acf_monthly_sentiment <- ggAcf(diff_sentiment_monthly, 20) +
  ggtitle("ACF of Differenced Monthly Sentiment") +
  theme_minimal()

# ACF plot for differenced monthly bias (20 lags)
acf_monthly_bias <- ggAcf(diff_bias_monthly, 20) +
  ggtitle("ACF of Differenced Monthly Bias") +
  theme_minimal()

# PACF plot for differenced daily sentiment (50 lags)
pacf_daily_sentiment <- ggPacf(diff_sentiment_daily, 50) +
  ggtitle("PACF of Differenced Daily Sentiment") +
  theme_minimal()

# PACF plot for differenced monthly sentiment (20 lags)
pacf_monthly_sentiment <- ggPacf(diff_sentiment_monthly, 20) +
  ggtitle("PACF of Differenced Monthly Sentiment") +
  theme_minimal()

# PACF plot for differenced monthly bias (20 lags)
pacf_monthly_bias <- ggPacf(diff_bias_monthly, 20) +
  ggtitle("PACF of Differenced Monthly Bias") +
  theme_minimal()
# Display ACF plots
cat("ACF Plots for Differenced Series:\n")
ACF Plots for Differenced Series:
acf_daily_sentiment / acf_monthly_sentiment / acf_monthly_bias

# Display PACF plots
cat("PACF Plots for Differenced Series:\n")
PACF Plots for Differenced Series:
pacf_daily_sentiment / pacf_monthly_sentiment / pacf_monthly_bias

The ACF and PACF plots of the differenced daily sentiment show remaining autocorrelation, hence the need for second-order differencing.

The ACF and PACF plot of the differenced monthly sentiment and bias show that most autocorrelations fall within the significance bounds after the first few lags

Second order differencing for daily sentiment

# Second-order differencing for daily sentiment (difference of the
# first-order differenced series)
diff2_sentiment_daily <- diff(diff_sentiment_daily)

# Plot the second-order differenced data.
# Fix: wrap time() in as.numeric() so ggplot receives a plain numeric x
# vector instead of a `ts` object, avoiding the automatic-scale warning.
p_diff2 <- ggplot() + 
  geom_line(aes(x = as.numeric(time(diff2_sentiment_daily)),
                y = as.numeric(diff2_sentiment_daily)),
            color = "steelblue") + 
  labs(title = "Second-Order Differenced Daily Sentiment - USA Combined Categories",
       x = "Time",
       y = "Second-Order Differenced Sentiment Score") +
  theme_minimal()

# Display the plot
p_diff2
Don't know how to automatically pick scale for object of type <ts>. Defaulting
to continuous.

# Check stationarity of second-order differenced series
cat("ADF Test for Second-Order Differenced Daily Sentiment:\n")
ADF Test for Second-Order Differenced Daily Sentiment:
adf_diff2_daily_sentiment <- adf.test(diff2_sentiment_daily, alternative = "stationary")
Warning in adf.test(diff2_sentiment_daily, alternative = "stationary"): p-value
smaller than printed p-value
print(adf_diff2_daily_sentiment)

    Augmented Dickey-Fuller Test

data:  diff2_sentiment_daily
Dickey-Fuller = -25.216, Lag order = 13, p-value = 0.01
alternative hypothesis: stationary
cat("Interpretation: ")
Interpretation: 
if(adf_diff2_daily_sentiment$p.value < 0.05) {
  cat("Second-order differenced series is STATIONARY (p-value < 0.05)\n")
} else {
  cat("Second-order differenced series is still NON-STATIONARY (p-value >= 0.05)\n")
}
Second-order differenced series is STATIONARY (p-value < 0.05)
# ACF and PACF plots for second-order differenced daily sentiment

# ACF plot (first 50 lags) -- used to read off a candidate MA order q
acf_diff2_daily_sentiment <- ggAcf(diff2_sentiment_daily, 50) +
  ggtitle("ACF of Second-Order Differenced Daily Sentiment") +
  theme_minimal()

# PACF plot (first 50 lags) -- used to read off a candidate AR order p
pacf_diff2_daily_sentiment <- ggPacf(diff2_sentiment_daily, 50) +
  ggtitle("PACF of Second-Order Differenced Daily Sentiment") +
  theme_minimal()

# Display both plots stacked vertically (patchwork's `/` operator)
cat("ACF and PACF Plots for Second-Order Differenced Daily Sentiment:\n")
ACF and PACF Plots for Second-Order Differenced Daily Sentiment:
acf_diff2_daily_sentiment / pacf_diff2_daily_sentiment

# Comparison of first-order vs second-order differencing ACF plots
# (acf_daily_sentiment is the first-order ACF built earlier in the document)
cat("Comparison: First-Order vs Second-Order Differencing ACF:\n")
Comparison: First-Order vs Second-Order Differencing ACF:
acf_daily_sentiment / acf_diff2_daily_sentiment

The ACF and PACF of the second-order differenced series suggest that the series may be over-differenced, as evidenced by the negative correlation patterns at low lags.

For modeling, we will be predicting monthly sentiment and bias; hence the following parameter choices and modeling exercises will focus on monthly sentiment and bias.

Parameter choices - d = 1 since first-order differencing is sufficient for stationarity - p (AR order): from the PACF plots, we see significant spikes at lags 1 and 2, so we will test p = 0, 1, 2 - q (MA order): from the ACF plot, we see a significant correlation at lag 1, so we will test q = 0, 1, 2

Parameter search for monthly sentiment

# Model 1: ARIMA (0,1,0) -- pure random walk after one difference (no AR/MA terms); drift suppressed
model1 <- Arima(sentiment_monthly_ts, order = c(0, 1, 0), include.drift = FALSE)
summary(model1)
Series: sentiment_monthly_ts 
ARIMA(0,1,0) 

sigma^2 = 0.02653:  log likelihood = 31.26
AIC=-60.52   AICc=-60.47   BIC=-58.15

Training set error measures:
                       ME     RMSE        MAE MPE MAPE    MASE       ACF1
Training set -0.008521767 0.161873 0.08038175 NaN  Inf 0.42168 -0.3911877
# Model 2: ARIMA (1,1,0) -- one AR term on the differenced series
model2 <- Arima(sentiment_monthly_ts, order = c(1, 1, 0), include.drift = FALSE)
summary(model2)
Series: sentiment_monthly_ts 
ARIMA(1,1,0) 

Coefficients:
          ar1
      -0.4722
s.e.   0.1117

sigma^2 = 0.02188:  log likelihood = 39.25
AIC=-74.5   AICc=-74.34   BIC=-69.76

Training set error measures:
                      ME      RMSE        MAE MPE MAPE      MASE        ACF1
Training set -0.01166505 0.1460697 0.07906747 NaN  Inf 0.4147853 -0.04152011
# Model 3: ARIMA (2,1,0) -- two AR terms on the differenced series
model3 <- Arima(sentiment_monthly_ts, order = c(2, 1, 0), include.drift = FALSE)
summary(model3)
Series: sentiment_monthly_ts 
ARIMA(2,1,0) 

Coefficients:
          ar1      ar2
      -0.5970  -0.3040
s.e.   0.1181   0.1181

sigma^2 = 0.02042:  log likelihood = 42.41
AIC=-78.83   AICc=-78.51   BIC=-71.72

Training set error measures:
                      ME      RMSE        MAE MPE MAPE      MASE       ACF1
Training set -0.01394514 0.1401805 0.07923248 NaN  Inf 0.4156509 0.06257796
# Model 4: ARIMA (0,1,1) -- one MA term on the differenced series
model4 <- Arima(sentiment_monthly_ts, order = c(0, 1, 1), include.drift = FALSE)
summary(model4)
Series: sentiment_monthly_ts 
ARIMA(0,1,1) 

Coefficients:
          ma1
      -0.5574
s.e.   0.0939

sigma^2 = 0.02054:  log likelihood = 41.69
AIC=-79.37   AICc=-79.21   BIC=-74.63

Training set error measures:
                      ME      RMSE        MAE MPE MAPE      MASE       ACF1
Training set -0.01578316 0.1415283 0.07938302 NaN  Inf 0.4164407 0.03344299
# Model 5: ARIMA (0,1,2) -- two MA terms on the differenced series
model5 <- Arima(sentiment_monthly_ts, order = c(0, 1, 2), include.drift = FALSE)
summary(model5)
Series: sentiment_monthly_ts 
ARIMA(0,1,2) 

Coefficients:
          ma1     ma2
      -0.6394  0.1344
s.e.   0.1281  0.1363

sigma^2 = 0.02056:  log likelihood = 42.15
AIC=-78.29   AICc=-77.97   BIC=-71.19

Training set error measures:
                      ME      RMSE        MAE MPE MAPE      MASE      ACF1
Training set -0.01450894 0.1406629 0.08036869 NaN  Inf 0.4216115 0.1006025
# Model 6: ARIMA (1,1,1) -- mixed model: one AR plus one MA term
model6 <- Arima(sentiment_monthly_ts, order = c(1, 1, 1), include.drift = FALSE)
summary(model6)
Series: sentiment_monthly_ts 
ARIMA(1,1,1) 

Coefficients:
          ar1      ma1
      -0.1549  -0.4603
s.e.   0.1931   0.1628

sigma^2 = 0.02065:  log likelihood = 41.99
AIC=-77.97   AICc=-77.65   BIC=-70.87

Training set error measures:
                      ME      RMSE        MAE MPE MAPE      MASE       ACF1
Training set -0.01512532 0.1409653 0.07995176 NaN  Inf 0.4194242 0.08095608
# Model 7: ARIMA (1,1,2) -- one AR term with two MA terms
model7 <- Arima(sentiment_monthly_ts, order = c(1, 1, 2), include.drift = FALSE)
summary(model7)
Series: sentiment_monthly_ts 
ARIMA(1,1,2) 

Coefficients:
         ar1      ma1     ma2
      0.3016  -0.9289  0.3052
s.e.  0.5271   0.4912  0.2762

sigma^2 = 0.02075:  log likelihood = 42.29
AIC=-76.59   AICc=-76.05   BIC=-67.11

Training set error measures:
                      ME     RMSE        MAE MPE MAPE      MASE      ACF1
Training set -0.01372643 0.140405 0.08038007 NaN  Inf 0.4216711 0.0928004
# Model 8: ARIMA (2,1,2) -- the largest candidate: two AR and two MA terms
model8 <- Arima(sentiment_monthly_ts, order = c(2, 1, 2), include.drift = FALSE)
summary(model8)
Series: sentiment_monthly_ts 
ARIMA(2,1,2) 

Coefficients:
          ar1      ar2      ma1     ma2
      -0.1234  -0.2373  -0.4890  0.2144
s.e.   0.8119   0.2384   0.8094  0.4391

sigma^2 = 0.02085:  log likelihood = 42.62
AIC=-75.23   AICc=-74.41   BIC=-63.39

Training set error measures:
                      ME      RMSE        MAE MPE MAPE      MASE     ACF1
Training set -0.01382598 0.1398063 0.07972581 NaN  Inf 0.4182389 0.079746

Model 4: ARIMA(0,1,1) has the lowest AIC (-79.37), indicating it is the best model according to this manual selection process.

Loop-based approach (note: this loop previously broke the terminal in VS Code, hence the small delay between fits)

# Parameter search for monthly sentiment

# Load necessary libraries
library(knitr)  # for kable() result tables below

# Define parameter ranges based on ACF/PACF analysis
p_range <- 0:2  # AR order: test p = 0, 1, 2 based on PACF analysis
d <- 1          # Differencing: d = 1 since first order differencing is sufficient
q_range <- 0:2  # MA order: test q = 0, 1, 2 based on ACF analysis

# Calculate total combinations
n_combinations <- length(p_range) * length(q_range)
cat("Testing", n_combinations, "ARIMA model combinations for monthly sentiment...\n\n")
Testing 9 ARIMA model combinations for monthly sentiment...
# Create an empty matrix to store results
# Columns: p, d, q, AIC, BIC, AICc -- one row per (p,q) combination, filled by the loop below
results_matrix <- matrix(NA, nrow = n_combinations, ncol = 6)
# Initialize index for matrix row
i <- 1

cat("Starting ARIMA parameter search...\n")
Starting ARIMA parameter search...
# Grid-search every (p, 1, q) combination and record information criteria.
# Results land in `results_matrix` (columns: p, d, q, AIC, BIC, AICc),
# one row per combination, in the order the loops visit them.
for (p in p_range) {
  for (q in q_range) {
    cat("Testing ARIMA(", p, ",", d, ",", q, ")... ")

    # Try to fit the ARIMA model with comprehensive error handling
    tryCatch({
      # Guard: an ARIMA(p,d,q) fit needs more observations than parameters
      if(length(sentiment_monthly_ts) < (p + q + d + 1)) {
        cat("INSUFFICIENT DATA\n")
        results_matrix[i, ] <- c(p, d, q, NA, NA, NA)
      } else {
        # Fit ARIMA model with specified (p,d,q) for monthly sentiment
        model <- Arima(sentiment_monthly_ts, order = c(p, d, q), include.drift = FALSE)

        # Store model parameters and AIC/BIC/AICc values in matrix
        results_matrix[i, ] <- c(p, d, q, model$aic, model$bic, model$aicc)

        # Print success
        cat("SUCCESS - AIC:", round(model$aic, 3), "\n")
      }

    }, warning = function(w) {
      cat("WARNING:", w$message, "\n")
      # BUG FIX: handlers run in their own function scope, so a plain `<-`
      # here modified a throwaway local copy of results_matrix; `<<-` updates
      # the matrix in the enclosing environment. (The bug was latent only
      # because the matrix is pre-filled with NA.)
      results_matrix[i, ] <<- c(p, d, q, NA, NA, NA)
    }, error = function(e) {
      # If model fails to fit, store NA values and print the error
      cat("FAILED -", e$message, "\n")
      results_matrix[i, ] <<- c(p, d, q, NA, NA, NA)
    })

    # Increment row index
    i <- i + 1

    # Small delay; the loop previously overwhelmed the VS Code terminal
    Sys.sleep(0.1)
  }
}
Testing ARIMA( 0 , 1 , 0 )... SUCCESS - AIC: -60.523 
Testing ARIMA( 0 , 1 , 1 )... SUCCESS - AIC: -79.372 
Testing ARIMA( 0 , 1 , 2 )... SUCCESS - AIC: -78.295 
Testing ARIMA( 1 , 1 , 0 )... SUCCESS - AIC: -74.502 
Testing ARIMA( 1 , 1 , 1 )... SUCCESS - AIC: -77.974 
Testing ARIMA( 1 , 1 , 2 )... SUCCESS - AIC: -76.588 
Testing ARIMA( 2 , 1 , 0 )... SUCCESS - AIC: -78.827 
Testing ARIMA( 2 , 1 , 1 )... SUCCESS - AIC: -77.034 
Testing ARIMA( 2 , 1 , 2 )... SUCCESS - AIC: -75.234 
cat("\nParameter search completed.\n")

Parameter search completed.
# Assemble the search results into a labelled data frame
results_df <- setNames(as.data.frame(results_matrix),
                       c("p", "d", "q", "AIC", "BIC", "AICc"))

# Keep only combinations whose fit succeeded (no NA in any column)
results_df_clean <- results_df[complete.cases(results_df), ]

cat("Successfully fitted models:", nrow(results_df_clean), "out of",
    nrow(results_df), "\n")
Successfully fitted models: 9 out of 9 
if (nrow(results_df_clean) > 0) {
  # Report the winning (p,d,q) under each information criterion.
  # The cat() argument layout reproduces the original spacing exactly.
  cat("\n=== Best Model Selection Results ===\n")
  for (crit in c("AIC", "BIC", "AICc")) {
    best <- results_df_clean[which.min(results_df_clean[[crit]]), ]
    cat(paste0("Best model by ", crit, ": ARIMA("),
        best$p, ",", best$d, ",", best$q,
        paste0(") - ", crit, " ="), round(best[[crit]], 3), "\n")
  }

  # Formatted comparison table of all successfully fitted models
  cat("\n")
  knitr::kable(results_df_clean,
               align = 'c',
               caption = "ARIMA Model Comparison for Monthly Sentiment",
               digits = 3)
} else {
  # Nothing fitted: surface the problem and show the series summary for debugging
  cat("ERROR: No models were successfully fitted. Check your data.\n")
  print(summary(sentiment_monthly_ts))
}

=== Best Model Selection Results ===
Best model by AIC: ARIMA( 0 , 1 , 1 ) - AIC = -79.372 
Best model by BIC: ARIMA( 0 , 1 , 1 ) - BIC = -74.633 
Best model by AICc: ARIMA( 0 , 1 , 1 ) - AICc = -79.214 
Warning: 'xfun::attr()' is deprecated.
Use 'xfun::attr2()' instead.
See help("Deprecated")

Warning: 'xfun::attr()' is deprecated.
Use 'xfun::attr2()' instead.
See help("Deprecated")
ARIMA Model Comparison for Monthly Sentiment
p d q AIC BIC AICc
0 1 0 -60.523 -58.154 -60.471
0 1 1 -79.372 -74.633 -79.214
0 1 2 -78.295 -71.186 -77.975
1 1 0 -74.502 -69.763 -74.344
1 1 1 -77.974 -70.866 -77.654
1 1 2 -76.588 -67.110 -76.047
2 1 0 -78.827 -71.719 -78.507
2 1 1 -77.034 -67.556 -76.493
2 1 2 -75.234 -63.387 -74.412

Auto selection of best model

# Automatic order selection (stepwise search; seasonal terms disabled)
auto_sentiment_monthly.model <- auto.arima(sentiment_monthly_ts, seasonal = FALSE)
# $arma stores (p, q, P, Q, period, d, D), so elements 1, 6, 2 print the (p, d, q) order
cat("Auto-selected model:", auto_sentiment_monthly.model$arma[1], auto_sentiment_monthly.model$arma[6], auto_sentiment_monthly.model$arma[2])
Auto-selected model: 1 0 1
summary(auto_sentiment_monthly.model)
Series: sentiment_monthly_ts 
ARIMA(1,0,1) with zero mean 

Coefficients:
         ar1      ma1
      0.9424  -0.5327
s.e.  0.0421   0.1056

sigma^2 = 0.02028:  log likelihood = 42.86
AIC=-79.72   AICc=-79.41   BIC=-72.58

Training set error measures:
                      ME      RMSE        MAE MPE MAPE      MASE        ACF1
Training set 0.004185935 0.1406097 0.07172981 NaN  Inf 0.3762922 -0.09366502

The auto selected model is ARIMA(1,0,1)

Model diagnostics to select the best model

Manual model:

# Residual diagnostics (residual plot, ACF, Q-Q, Ljung-Box) for the manual ARIMA(0,1,0);
# capture.output() swallows sarima's verbose optimizer printout while still drawing the plots
model_manual_sentiment <- capture.output(sarima(sentiment_monthly_ts, 0,1,0))

  • The residual plot shows consistent fluctuation around zero but not constant variance, which suggests the model might need slight improvement
  • The ACF plot does not reveal any significant correlations
  • The Q-Q plot indicates that the residuals follow a near-normal distribution, with minor deviations at the tails, which is typical in time series data
  • The Ljung-Box Test p-values remain mostly above the 0.05 significance level, implying no significant autocorrelations left in the residuals and concluding that the model is adequate
# Same residual diagnostics for the auto-selected ARIMA(1,0,1)
model_auto_sentiment <- capture.output(sarima(sentiment_monthly_ts, 1,0,1))

  • The residual plot does not show consistent fluctuation around zero and no constant variation which shows that the model might need improvement
  • The ACF plot does not reveal any significant correlations
  • The Q-Q plot indicates that the residuals follow a near-normal distribution, with major deviations at the tails,
  • The Ljung-Box Test p-values remain mostly above the 0.05 significance level, implying no significant autocorrelations left in the residuals and concluding that the model might be adequate

Both models perform similarly, with the residual, ACF, and normal Q-Q plots almost identical. The only difference is the Ljung-Box statistic plot, with the automatic model producing p-values that are clearly above 0.05.

Benchmark methods and models for sentiment

# Orders selected earlier: manual diagnostics vs auto.arima
manual_order <- c(0, 1, 0)
auto_order <- c(1, 0, 1)

# Hold out the final 6 observations as a test window
n <- length(sentiment_monthly_ts)
train_data <- sentiment_monthly_ts[seq_len(n - 6)]
test_data <- sentiment_monthly_ts[(n - 5):n]

# Candidate ARIMA models and benchmark methods, all forecasting 6 steps ahead
fc_all <- list(
  "Manual ARIMA" = forecast(Arima(train_data, order = manual_order, include.drift = FALSE), h = 6),
  "Auto ARIMA" = forecast(Arima(train_data, order = auto_order, include.drift = FALSE), h = 6),
  "Naive" = naive(train_data, h = 6),
  "Mean" = meanf(train_data, h = 6),
  "RWF with Drift" = rwf(train_data, h = 6, drift = TRUE)
)

# Test-set accuracy for every method (accuracy() row 2 = test set;
# columns 2/3/5 = RMSE/MAE/MAPE)
acc_all <- lapply(fc_all, accuracy, test_data)

# Side-by-side comparison table
comparison <- data.frame(
  Method = names(fc_all),
  MAE = vapply(acc_all, function(a) a[2, 3], numeric(1), USE.NAMES = FALSE),
  RMSE = vapply(acc_all, function(a) a[2, 2], numeric(1), USE.NAMES = FALSE),
  MAPE = vapply(acc_all, function(a) a[2, 5], numeric(1), USE.NAMES = FALSE)
)
knitr::kable(comparison, digits = 3, caption = "Model Performance Comparison")
Warning: 'xfun::attr()' is deprecated.
Use 'xfun::attr2()' instead.
See help("Deprecated")

Warning: 'xfun::attr()' is deprecated.
Use 'xfun::attr2()' instead.
See help("Deprecated")
Model Performance Comparison
Method MAE RMSE MAPE
Manual ARIMA 0.025 0.027 52.577
Auto ARIMA 0.075 0.080 167.259
Naive 0.025 0.027 52.577
Mean 0.155 0.156 403.982
RWF with Drift 0.009 0.014 28.218

The random walk with drift has the lowest MAE, RMSE, and MAPE, outperforming both the Manual ARIMA and Auto ARIMA.

Plot forecasts using the manual ARIMA as the chosen model alongside the baseline methods, since it outperforms the auto ARIMA.

# Forecast visualization

# Refit the chosen manual model on the full sentiment series
best_manual_fit <- Arima(sentiment_monthly_ts, order = manual_order, include.drift = FALSE)

# Overlay 60-step-ahead forecasts from every method on the observed series
horizon <- 60
autoplot(sentiment_monthly_ts) +
  autolayer(meanf(sentiment_monthly_ts, h = horizon), series = "Mean", PI = FALSE) +
  autolayer(naive(sentiment_monthly_ts, h = horizon), series = "Naïve", PI = FALSE) +
  autolayer(rwf(sentiment_monthly_ts, drift = TRUE, h = horizon), series = "RWF with Drift", PI = FALSE) +
  autolayer(forecast(best_manual_fit, h = horizon), series = "Manual ARIMA", PI = FALSE) +
  labs(title = "Sentiment Forecast Comparison",
       x = "Time",
       y = "Sentiment (health, science, and general category)",
       colour = "Forecast Methods") +
  theme_minimal()

Parameter search for monthly bias

# Model 1: ARIMA (0,1,0) -- pure random walk after one difference (baseline); drift suppressed
model1 <- Arima(bias_monthly_ts, order = c(0, 1, 0), include.drift = FALSE)
summary(model1)
Series: bias_monthly_ts 
ARIMA(0,1,0) 

sigma^2 = 0.1704:  log likelihood = -42.19
AIC=86.39   AICc=86.44   BIC=88.76

Training set error measures:
                      ME      RMSE       MAE       MPE     MAPE      MASE
Training set -0.01140549 0.4101878 0.2138057 -188.5115 482.6352 0.6365364
                   ACF1
Training set -0.5095393
# Model 2: ARIMA (1,1,0) -- one AR term on the differenced bias series
model2 <- Arima(bias_monthly_ts, order = c(1, 1, 0), include.drift = FALSE)
summary(model2)
Series: bias_monthly_ts 
ARIMA(1,1,0) 

Coefficients:
          ar1
      -0.5997
s.e.   0.1007

sigma^2 = 0.1191:  log likelihood = -27.78
AIC=59.55   AICc=59.71   BIC=64.29

Training set error measures:
                      ME      RMSE       MAE       MPE     MAPE      MASE
Training set -0.01460045 0.3408022 0.1978034 -78.00665 276.4834 0.5888948
                   ACF1
Training set -0.1270015
# Model 3: ARIMA (2,1,0) -- two AR terms on the differenced bias series
model3 <- Arima(bias_monthly_ts, order = c(2, 1, 0), include.drift = FALSE)
summary(model3)
Series: bias_monthly_ts 
ARIMA(2,1,0) 

Coefficients:
          ar1      ar2
      -0.7991  -0.4049
s.e.   0.1088   0.1120

sigma^2 = 0.1033:  log likelihood = -21.79
AIC=49.58   AICc=49.9   BIC=56.69

Training set error measures:
                      ME      RMSE       MAE      MPE     MAPE      MASE
Training set -0.01659485 0.3153254 0.1989953 131.0169 297.7831 0.5924432
                   ACF1
Training set 0.02386103
# Model 4: ARIMA (0,1,1) -- one MA term on the differenced bias series
model4 <- Arima(bias_monthly_ts, order = c(0, 1, 1), include.drift = FALSE)
summary(model4)
Series: bias_monthly_ts 
ARIMA(0,1,1) 

Coefficients:
          ma1
      -0.7562
s.e.   0.0827

sigma^2 = 0.1034:  log likelihood = -22.4
AIC=48.8   AICc=48.96   BIC=53.54

Training set error measures:
                      ME      RMSE       MAE      MPE     MAPE      MASE
Training set -0.02437428 0.3175789 0.1949152 81.01677 183.9622 0.5802961
                    ACF1
Training set -0.03081449
# Model 5: ARIMA (0,1,2) -- two MA terms on the differenced bias series
model5 <- Arima(bias_monthly_ts, order = c(0, 1, 2), include.drift = FALSE)
summary(model5)
Series: bias_monthly_ts 
ARIMA(0,1,2) 

Coefficients:
          ma1     ma2
      -0.8414  0.1776
s.e.   0.1056  0.1283

sigma^2 = 0.1025:  log likelihood = -21.5
AIC=49   AICc=49.32   BIC=56.11

Training set error measures:
                      ME      RMSE       MAE     MPE     MAPE      MASE
Training set -0.01927906 0.3141134 0.1921457 103.312 227.0911 0.5720508
                   ACF1
Training set 0.05863614
# Model 6: ARIMA (1,1,1) -- mixed model: one AR plus one MA term
model6 <- Arima(bias_monthly_ts, order = c(1, 1, 1), include.drift = FALSE)
summary(model6)
Series: bias_monthly_ts 
ARIMA(1,1,1) 

Coefficients:
          ar1      ma1
      -0.2406  -0.6166
s.e.   0.1679   0.1422

sigma^2 = 0.1024:  log likelihood = -21.47
AIC=48.93   AICc=49.25   BIC=56.04

Training set error measures:
                      ME      RMSE       MAE      MPE     MAPE      MASE
Training set -0.01998001 0.3139071 0.1927771 103.1747 226.0265 0.5739305
                   ACF1
Training set 0.06668672
# Model 7: ARIMA (1,1,2) -- one AR term with two MA terms
model7 <- Arima(bias_monthly_ts, order = c(1, 1, 2), include.drift = FALSE)
summary(model7)
Series: bias_monthly_ts 
ARIMA(1,1,2) 

Coefficients:
          ar1     ma1      ma2
      -0.9871  0.2063  -0.6736
s.e.   0.0224  0.0973   0.0949

sigma^2 = 0.09993:  log likelihood = -20.31
AIC=48.62   AICc=49.16   BIC=58.1

Training set error measures:
                      ME     RMSE       MAE      MPE     MAPE      MASE
Training set -0.02120117 0.308111 0.2029017 9.052154 214.1556 0.6040732
                   ACF1
Training set 0.03457281
# Model 8: ARIMA (2,1,2) -- the largest candidate: two AR and two MA terms
model8 <- Arima(bias_monthly_ts, order = c(2, 1, 2), include.drift = FALSE)
summary(model8)
Series: bias_monthly_ts 
ARIMA(2,1,2) 

Coefficients:
          ar1      ar2      ma1      ma2
      -0.5656  -0.2268  -0.2869  -0.0592
s.e.   0.4246   0.2291   0.4205   0.2984

sigma^2 = 0.1044:  log likelihood = -21.18
AIC=52.36   AICc=53.18   BIC=64.21

Training set error measures:
                      ME      RMSE       MAE     MPE     MAPE      MASE
Training set -0.01835702 0.3128212 0.1953215 127.972 275.8102 0.5815057
                   ACF1
Training set 0.07124832

Model 7: ARIMA(1,1,2) has the lowest AIC (48.62), indicating it is the best model for bias according to this manual selection process.

Auto selection of best model

# Automatic order selection for monthly bias (stepwise search; seasonal terms disabled)
auto_bias_monthly.model <- auto.arima(bias_monthly_ts, seasonal = FALSE)
# $arma stores (p, q, P, Q, period, d, D), so elements 1, 6, 2 print the (p, d, q) order
cat("Auto-selected model:", auto_bias_monthly.model$arma[1], auto_bias_monthly.model$arma[6], auto_bias_monthly.model$arma[2])
Auto-selected model: 4 0 1
summary(auto_bias_monthly.model)
Series: bias_monthly_ts 
ARIMA(4,0,1) with zero mean 

Coefficients:
          ar1     ar2     ar3     ar4     ma1
      -0.7682  0.3098  0.4889  0.3909  0.8927
s.e.   0.1176  0.1252  0.1266  0.1098  0.0672

sigma^2 = 0.08966:  log likelihood = -15.27
AIC=42.54   AICc=43.7   BIC=56.84

Training set error measures:
                     ME      RMSE       MAE      MPE     MAPE      MASE
Training set 0.03253763 0.2899303 0.1999279 55.82998 278.6028 0.5952198
                 ACF1
Training set 0.031864

The auto selected model is ARIMA(4,0,1)

Model diagnostics to select the best model

Manual model:

# Residual diagnostics (residual plot, ACF, Q-Q, Ljung-Box) for the manual ARIMA(1,1,2);
# capture.output() swallows sarima's verbose optimizer printout while still drawing the plots
model_manual_bias <- capture.output(sarima(bias_monthly_ts, 1,1,2))

  • The residual plot does not show consistent fluctuation around zero and no constant variation which shows that the model might need improvement
  • The ACF plot does not reveal any significant correlations
  • The Q-Q plot indicates that the residuals follow a slightly near-normal distribution, with major deviations at the tails,
  • The Ljung-Box Test p-values remain mostly above the 0.05 significance level, implying no significant autocorrelations left in the residuals and concluding that the model is adequate

Auto model:

# Same residual diagnostics for the auto-selected ARIMA(4,0,1)
model_auto_bias <- capture.output(sarima(bias_monthly_ts, 4,0,1))

  • The residual plot does not show consistent fluctuation around zero and no constant variation which shows that the model might need improvement
  • The ACF plot does not reveal any significant correlations
  • The Q-Q plot indicates that the residuals follow a near-normal distribution, with minor deviations at the tails,
  • The Ljung-Box Test p-values remain mostly above the 0.05 significance level, implying no significant autocorrelations left in the residuals and concluding that the model might be adequate

Both models perform similarly, with the residual, ACF, and normal Q-Q plots very similar. The main difference is the Ljung-Box statistic plot, with the automatic model producing p-values that are clearly above 0.05.

Benchmark methods and models for bias

# Orders selected earlier: manual diagnostics vs auto.arima
manual_order <- c(1, 1, 2)
auto_order <- c(4, 0, 1)

# Hold out the final 6 observations as a test window
n <- length(bias_monthly_ts)
train_data <- bias_monthly_ts[seq_len(n - 6)]
test_data <- bias_monthly_ts[(n - 5):n]

# Candidate ARIMA models and benchmark methods, all forecasting 6 steps ahead
fc_all <- list(
  "Manual ARIMA" = forecast(Arima(train_data, order = manual_order, include.drift = FALSE), h = 6),
  "Auto ARIMA" = forecast(Arima(train_data, order = auto_order, include.drift = FALSE), h = 6),
  "Naive" = naive(train_data, h = 6),
  "Mean" = meanf(train_data, h = 6),
  "RWF with Drift" = rwf(train_data, h = 6, drift = TRUE)
)

# Test-set accuracy for every method (accuracy() row 2 = test set;
# columns 2/3/5 = RMSE/MAE/MAPE)
acc_all <- lapply(fc_all, accuracy, test_data)

# Side-by-side comparison table
comparison <- data.frame(
  Method = names(fc_all),
  MAE = vapply(acc_all, function(a) a[2, 3], numeric(1), USE.NAMES = FALSE),
  RMSE = vapply(acc_all, function(a) a[2, 2], numeric(1), USE.NAMES = FALSE),
  MAPE = vapply(acc_all, function(a) a[2, 5], numeric(1), USE.NAMES = FALSE)
)
knitr::kable(comparison, digits = 3, caption = "Model Performance Comparison")
Warning: 'xfun::attr()' is deprecated.
Use 'xfun::attr2()' instead.
See help("Deprecated")

Warning: 'xfun::attr()' is deprecated.
Use 'xfun::attr2()' instead.
See help("Deprecated")
Model Performance Comparison
Method MAE RMSE MAPE
Manual ARIMA 0.024 0.029 40.491
Auto ARIMA 0.032 0.037 73.795
Naive 0.012 0.016 29.161
Mean 0.063 0.065 137.971
RWF with Drift 0.040 0.047 71.534

The Naive model has a lower MAE, RMSE, and MAPE, outperforming both the Manual ARIMA and Auto Arima

Plot forecasts using the manual ARIMA as the chosen model alongside the baseline methods, since it outperforms the auto ARIMA.

# Forecast visualization

# Refit the chosen manual model on the full bias series
best_manual_fit <- Arima(bias_monthly_ts, order = manual_order, include.drift = FALSE)

# Overlay 60-step-ahead forecasts from every method on the observed series
horizon <- 60
autoplot(bias_monthly_ts) +
  autolayer(meanf(bias_monthly_ts, h = horizon), series = "Mean", PI = FALSE) +
  autolayer(naive(bias_monthly_ts, h = horizon), series = "Naïve", PI = FALSE) +
  autolayer(rwf(bias_monthly_ts, drift = TRUE, h = horizon), series = "RWF with Drift", PI = FALSE) +
  autolayer(forecast(best_manual_fit, h = horizon), series = "Manual ARIMA", PI = FALSE) +
  labs(title = "Bias Forecast Comparison",
       x = "Time",
       y = "Bias (health, science, and general category)",
       colour = "Forecast Methods") +
  theme_minimal()